import pandas as pd
import numpy as np
import gzip
import json
import plotly.io as pio
pio.renderers.default='notebook'
%%HTML
<script src="require.js"></script>
# Load the cleaned product metadata and the cleaned reviews from parquet.
meta_df_1000 = pd.read_parquet('meta_df_1000_cleaned.parquet')
reviews_df = pd.read_parquet('reviews_df_clean.parquet')

# `category` holds a sequence of labels per product: take the entry at
# position 1, and the entry at position 2 when at least three labels exist.
category_labels = meta_df_1000['category']
categories_arr1 = category_labels.apply(lambda labels: labels[1])
categories_arr2 = category_labels.apply(
    lambda labels: None if len(labels) < 3 else labels[2]
)
categories_arr1
158 eBook Readers & Accessories
243 Accessories & Supplies
947 Accessories & Supplies
1336 Computers & Accessories
2327 Accessories & Supplies
...
780642 Headphones
781908 Computers & Accessories
782902 Computers & Accessories
783568 Car & Vehicle Electronics
785453 Computers & Accessories
Name: category, Length: 786, dtype: object
import plotly.express as px
# Count how many products fall into each first-level sub-category.
fig = px.histogram(categories_arr1, x='category', text_auto=True)
fig.update_layout(title_text='Frequencies of 1st sub-categories within Electronics')
fig.show()
# Join product metadata with reviews on the product id; only products that
# appear in both tables are kept.
merged_df = meta_df_1000.merge(reviews_df, on='asin', how='inner')
merged_df['category_1'] = merged_df['category'].apply(lambda labels: labels[1])
merged_df['rating'] = merged_df['rating'].astype(float)

# Average rating per first-level sub-category.
fig = px.histogram(
    merged_df,
    x='category_1',
    y='rating',
    histfunc='avg',
    text_auto=True,
)
fig.update_layout(title_text='Mean rating over categories for all subcategories within Electronics')
fig.show()
# Per product: mean rating and number of reviews, ordered by review count.
product_mean_rating = (
    merged_df.groupby('asin')
    .agg(mean_rating=('rating', 'mean'), count=('asin', 'count'))
    .reset_index()
    .sort_values('count')
)
product_mean_rating
| asin | mean_rating | count | |
|---|---|---|---|
| 649 | B00Y86CJ1A | 5.00 | 1 |
| 150 | B000Y1NES0 | 5.00 | 1 |
| 152 | B000YZ63MK | 2.00 | 1 |
| 490 | B00IML19MC | 5.00 | 1 |
| 155 | B0011FZB52 | 5.00 | 1 |
| ... | ... | ... | ... |
| 569 | B00OBTO8EA | 3.90 | 20 |
| 570 | B00ODEU0PY | 4.40 | 20 |
| 203 | B0022NHQB4 | 3.85 | 20 |
| 229 | B0039NM5SK | 4.45 | 20 |
| 775 | B01HEKL4KI | 2.60 | 20 |
776 rows × 3 columns
# Mean rating against review count per product, with an OLS trendline.
fig = px.scatter(
    product_mean_rating,
    x='count',
    y='mean_rating',
    trendline="ols",
)
# Kept as the last expression of the cell: update_layout returns the figure.
fig.update_layout(title_text='Product Mean rating vs Count of product reviews in the dataset')
!jupyter nbconvert --to html visualization.ipynb
[NbConvertApp] Converting notebook visualization.ipynb to html [NbConvertApp] Writing 819185 bytes to visualization.html
# Export the current figure as an HTML fragment (not a full page) that
# loads plotly.js from the CDN.
fig.write_html('plot1.html', include_plotlyjs='cdn', full_html=False)
# Helpful-vote count against star rating for every review, with an OLS trendline.
px.scatter(
    merged_df,
    x="rating",
    y='found_helpful',
    trendline='ols',
)
# Average rating for each (verified_purchase, category_1) pair, with
# marginal histograms on both axes.
fig = px.density_heatmap(
    merged_df,
    x="verified_purchase",
    y="category_1",
    z="rating",
    histfunc="avg",
    marginal_x="histogram",
    marginal_y="histogram",
    text_auto=True,
)
fig.update_layout(title_text='Avg ratings for each category and verified purchase combos')
fig.show()

# Inspect the raw `rank` column of the product metadata.
meta_df_1000['rank']
232 {'Amazon Launchpad ': None, 'Amazon Launchpad ...
738 {'Amazon Launchpad ': None, 'Amazon Launchpad ...
2385 {'Amazon Launchpad ': None, 'Amazon Launchpad ...
2573 {'Amazon Launchpad ': None, 'Amazon Launchpad ...
3269 {'Amazon Launchpad ': None, 'Amazon Launchpad ...
...
783492 {'Amazon Launchpad ': None, 'Amazon Launchpad ...
783763 {'Amazon Launchpad ': None, 'Amazon Launchpad ...
784274 {'Amazon Launchpad ': None, 'Amazon Launchpad ...
784963 {'Amazon Launchpad ': None, 'Amazon Launchpad ...
785097 {'Amazon Launchpad ': None, 'Amazon Launchpad ...
Name: rank, Length: 799, dtype: object
# One row per (review, rank entry): explode the list-valued `rank` column,
# drop rows without a rank entry, and renumber the rows.
rank_columns = ['asin', 'rank', 'rating', 'category_1']
rank_df = (
    merged_df[rank_columns]
    .explode('rank')
    .dropna(subset='rank')
    .reset_index(drop=True)
)
rank_df
| asin | rank | rating | category_1 | |
|---|---|---|---|---|
| 0 | 1039869017 | [Computers & Accessories > Tablet Accessories ... | 5.0 | Computers & Accessories |
| 1 | 1039869017 | [Computers & Accessories > Tablet Accessories ... | 5.0 | Computers & Accessories |
| 2 | 1944288023 | [Cell Phones & Accessories , 1,053,995] | 5.0 | Headphones |
| 3 | 1944288023 | [Cell Phones & Accessories > Cell Phone Access... | 5.0 | Headphones |
| 4 | 1944288023 | [Electronics > Home Audio & Theater, 153,549] | 5.0 | Headphones |
| ... | ... | ... | ... | ... |
| 10776 | B01HEKL4KI | [Electronics > Car Electronics > Car Video > O... | 3.0 | Car & Vehicle Electronics |
| 10777 | B01HEKL4KI | [Electronics > Car Electronics > Car Audio, 9,... | 3.0 | Car & Vehicle Electronics |
| 10778 | B01HEKL4KI | [Electronics , 163,429] | 3.0 | Car & Vehicle Electronics |
| 10779 | B01HEKL4KI | [Electronics > Car Electronics > Car Video > O... | 3.0 | Car & Vehicle Electronics |
| 10780 | B01HEKL4KI | [Electronics > Car Electronics > Car Audio, 9,... | 3.0 | Car & Vehicle Electronics |
10781 rows × 4 columns
# Split each two-element `rank` entry into a `category` column and a `rank`
# column. The guard makes the cell safe to re-run: once `category` exists the
# `rank` column already holds scalars, and rebuilding the two-column frame
# would raise "Shape of passed values ... indices imply ...".
if 'category' not in rank_df.columns:
    rank_pairs = pd.DataFrame(
        rank_df['rank'].to_list(),
        columns=['category', 'rank'],
        index=rank_df.index,  # align on rank_df's own index, not on row position
    )
    rank_df = pd.concat([rank_df.drop(columns='rank'), rank_pairs], axis=1)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In [200], line 1 ----> 1 rank_df = pd.concat([rank_df.drop(columns='rank'), pd.DataFrame(rank_df['rank'].to_list(), columns=['category', 'rank'])], axis=1) 2 rank_df['rank'] = rank_df['rank'].str.replace(',', '', regex=False) File ~/miniforge3/lib/python3.10/site-packages/pandas/core/frame.py:761, in DataFrame.__init__(self, data, index, columns, dtype, copy) 753 mgr = arrays_to_mgr( 754 arrays, 755 columns, (...) 758 typ=manager, 759 ) 760 else: --> 761 mgr = ndarray_to_mgr( 762 data, 763 index, 764 columns, 765 dtype=dtype, 766 copy=copy, 767 typ=manager, 768 ) 769 else: 770 mgr = dict_to_mgr( 771 {}, 772 index, (...) 775 typ=manager, 776 ) File ~/miniforge3/lib/python3.10/site-packages/pandas/core/internals/construction.py:349, in ndarray_to_mgr(values, index, columns, dtype, copy, typ) 344 # _prep_ndarraylike ensures that values.ndim == 2 at this point 345 index, columns = _get_axes( 346 values.shape[0], values.shape[1], index=index, columns=columns 347 ) --> 349 _check_values_indices_shape_match(values, index, columns) 351 if typ == "array": 353 if issubclass(values.dtype.type, str): File ~/miniforge3/lib/python3.10/site-packages/pandas/core/internals/construction.py:420, in _check_values_indices_shape_match(values, index, columns) 418 passed = values.shape 419 implied = (len(index), len(columns)) --> 420 raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}") ValueError: Shape of passed values is (10781, 1), indices imply (10781, 2)
# Convert rank strings such as "1,053,995" to floats by stripping the
# thousands separators. Skipped when the column is already numeric, so
# re-running the cell no longer fails with
# "Can only use .str accessor with string values!".
if not pd.api.types.is_numeric_dtype(rank_df['rank']):
    rank_df['rank'] = rank_df['rank'].str.replace(',', '', regex=False).astype(float)
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Cell In [202], line 1 ----> 1 rank_df['rank'] = rank_df['rank'].str.replace(',', '', regex=False).astype(float) 2 rank_df File ~/miniforge3/lib/python3.10/site-packages/pandas/core/generic.py:5902, in NDFrame.__getattr__(self, name) 5895 if ( 5896 name not in self._internal_names_set 5897 and name not in self._metadata 5898 and name not in self._accessors 5899 and self._info_axis._can_hold_identifiers_and_holds_name(name) 5900 ): 5901 return self[name] -> 5902 return object.__getattribute__(self, name) File ~/miniforge3/lib/python3.10/site-packages/pandas/core/accessor.py:182, in CachedAccessor.__get__(self, obj, cls) 179 if obj is None: 180 # we're accessing the attribute of the class, i.e., Dataset.geo 181 return self._accessor --> 182 accessor_obj = self._accessor(obj) 183 # Replace the property with the accessor object. Inspired by: 184 # https://www.pydanny.com/cached-property.html 185 # We need to use object.__setattr__ because we overwrite __setattr__ on 186 # NDFrame 187 object.__setattr__(obj, self._name, accessor_obj) File ~/miniforge3/lib/python3.10/site-packages/pandas/core/strings/accessor.py:181, in StringMethods.__init__(self, data) 178 def __init__(self, data) -> None: 179 from pandas.core.arrays.string_ import StringDtype --> 181 self._inferred_dtype = self._validate(data) 182 self._is_categorical = is_categorical_dtype(data.dtype) 183 self._is_string = isinstance(data.dtype, StringDtype) File ~/miniforge3/lib/python3.10/site-packages/pandas/core/strings/accessor.py:235, in StringMethods._validate(data) 232 inferred_dtype = lib.infer_dtype(values, skipna=True) 234 if inferred_dtype not in allowed_types: --> 235 raise AttributeError("Can only use .str accessor with string values!") 236 return inferred_dtype AttributeError: Can only use .str accessor with string values!
rank_df
| asin | rating | category_1 | category | rank | |
|---|---|---|---|---|---|
| 0 | 1039869017 | 5.0 | Computers & Accessories | Computers & Accessories > Tablet Accessories >... | 151274.0 |
| 1 | 1039869017 | 5.0 | Computers & Accessories | Computers & Accessories > Tablet Accessories >... | 151274.0 |
| 2 | 1944288023 | 5.0 | Headphones | Cell Phones & Accessories | 1053995.0 |
| 3 | 1944288023 | 5.0 | Headphones | Cell Phones & Accessories > Cell Phone Accesso... | 56064.0 |
| 4 | 1944288023 | 5.0 | Headphones | Electronics > Home Audio & Theater | 153549.0 |
| ... | ... | ... | ... | ... | ... |
| 10776 | B01HEKL4KI | 3.0 | Car & Vehicle Electronics | Electronics > Car Electronics > Car Video > On | 1532.0 |
| 10777 | B01HEKL4KI | 3.0 | Car & Vehicle Electronics | Electronics > Car Electronics > Car Audio | 9997.0 |
| 10778 | B01HEKL4KI | 3.0 | Car & Vehicle Electronics | Electronics | 163429.0 |
| 10779 | B01HEKL4KI | 3.0 | Car & Vehicle Electronics | Electronics > Car Electronics > Car Video > On | 1532.0 |
| 10780 | B01HEKL4KI | 3.0 | Car & Vehicle Electronics | Electronics > Car Electronics > Car Audio | 9997.0 |
10781 rows × 5 columns
# Show star ratings from 5 down to 1 in both plots.
rating_levels = [5.0, 4.0, 3.0, 2.0, 1.0]

# Rank distribution per category, coloured by rating (ranks below 1,000,000 only).
ranks_below_1m = rank_df[rank_df['rank'] < 1000000]
fig = px.box(
    ranks_below_1m,
    x="category_1",
    y="rank",
    color="rating",
    category_orders={'rating': rating_levels},
)
fig.show()

# Rank distribution per rating, coloured by category (ranks below 500,000 only).
ranks_below_500k = rank_df[rank_df['rank'] < 500000]
fig = px.box(
    ranks_below_500k,
    x="rating",
    y="rank",
    color='category_1',
    category_orders={'rating': rating_levels},
)
fig.show()
import nltk
import spacy
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from matplotlib import rcParams
from wordcloud import WordCloud, STOPWORDS
%matplotlib inline
nlp = spacy.load('en_core_web_sm')
0 (This, cover, was, an, amazing, deal, ,, not, ...
1 (Works, great, ., Has, a, obviously, fake, lea...
2 (This, may, be, a, kids, book, ,, but, I, sure...
3 (Such, a, great, book, !, The, illustrations, ...
4 (Whimsical, ,, poignant, ,, a, breath, of, fre...
...
5892 (Its, good, for, they, price, ,, i, just, wish...
5893 (I, like, this, product, ,, but, it, is, hard,...
5894 (This, little, DVR, is, Ok, for, the, money, d...
5895 (Play, back, just, give, picture, not, video)
5896 (Difficult, to, figure, out, for, an, old, per...
Name: tokens, Length: 5897, dtype: object
# Run the spaCy pipeline over the lower-cased text of every review.
merged_df['tokens'] = [nlp(review_text.lower()) for review_text in merged_df['content']]
merged_df['tokens']
0 (this, cover, was, an, amazing, deal, ,, not, ...
1 (works, great, ., has, a, obviously, fake, lea...
2 (this, may, be, a, kids, book, ,, but, i, sure...
3 (such, a, great, book, !, the, illustrations, ...
4 (whimsical, ,, poignant, ,, a, breath, of, fre...
...
5892 (its, good, for, they, price, ,, i, just, wish...
5893 (i, like, this, product, ,, but, it, is, hard,...
5894 (this, little, dvr, is, ok, for, the, money, d...
5895 (play, back, just, give, picture, not, video)
5896 (difficult, to, figure, out, for, an, old, per...
Name: tokens, Length: 5897, dtype: object
def _content_lemmas(doc):
    """Return the lemmas of *doc*, skipping stop words, punctuation and single-space lemmas."""
    return [
        tok.lemma_
        for tok in doc
        if not (tok.is_stop or tok.is_punct) and tok.lemma_ != ' '
    ]


# Replace each parsed review with its list of kept lemmas.
merged_df['tokens'] = merged_df['tokens'].apply(_content_lemmas)
# for w in merged_df.tokens[0]:
# print (w, w.lemma_, w.is_stop, w.is_punct)
# Frequency of every lemma within each first-level category, most frequent first.
word_freq_df = (
    merged_df[['category_1', 'tokens']]
    .explode('tokens')
    .reset_index(drop=True)
    .groupby(['category_1', 'tokens'])
    .agg(count=('category_1', 'count'))
    .reset_index()
    .sort_values('count', ascending=False)
)

# Spot check: how often does the lemma "one" occur in each category?
is_one = word_freq_df['tokens'] == 'one'
word_freq_df[is_one]
| category_1 | tokens | count | |
|---|---|---|---|
| 16711 | Computers & Accessories | one | 47 |
| 23841 | Headphones | one | 39 |
| 7076 | Camera & Photo | one | 28 |
| 2297 | Accessories & Supplies | one | 20 |
| 10563 | Car & Vehicle Electronics | one | 9 |
| 28871 | Portable Audio & Video | one | 6 |
| 30281 | Security & Surveillance | one | 2 |
| 31974 | Television & Video | one | 2 |
| 26672 | Home Audio | one | 2 |
| 20935 | GPS, Finders & Accessories | one | 1 |
# Build one space-separated string of lemmas per category.
tokens_df = merged_df[['category_1', 'tokens']].copy()
tokens_df['tokens'] = tokens_df['tokens'].apply(lambda lemmas: ' '.join(lemmas))
category_tokens = (
    tokens_df.groupby('category_1')['tokens']
    .agg(lambda review_texts: ' '.join(review_texts.to_list()))
)
category_tokens.index
Index(['Accessories & Supplies', 'Camera & Photo', 'Car & Vehicle Electronics',
'Computers & Accessories', 'Electronics Warranties',
'GPS, Finders & Accessories', 'Headphones', 'Home Audio',
'Portable Audio & Video', 'Security & Surveillance', 'Service Plans',
'Television & Video', 'Video Projectors',
'eBook Readers & Accessories'],
dtype='object', name='category_1')
# Word cloud over the raw text of all reviews.
all_review_text = ' '.join(merged_df.content.to_list())
wordcloud = WordCloud(
    stopwords=STOPWORDS,
    background_color="white",
    max_words=1000,
    min_font_size=2,
).generate(all_review_text)

# One word cloud per category, built from that category's joined lemmas.
for cat, cat_text in category_tokens.items():
    print(cat)
    wordcloud = WordCloud(
        stopwords=STOPWORDS,
        background_color="white",
        max_words=1000,
        width=1000,
        height=1000,
    ).generate(cat_text)
    rcParams['figure.figsize'] = (50, 50)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
Accessories & Supplies
Camera & Photo
Car & Vehicle Electronics
Computers & Accessories
Electronics Warranties
GPS, Finders & Accessories
Headphones
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) File ~/miniforge3/lib/python3.10/site-packages/PIL/ImageDraw.py:463, in ImageDraw.text.<locals>.draw_text(ink, stroke_width, stroke_offset) 462 try: --> 463 mask, offset = font.getmask2( 464 text, 465 mode, 466 direction=direction, 467 features=features, 468 language=language, 469 stroke_width=stroke_width, 470 anchor=anchor, 471 ink=ink, 472 start=start, 473 *args, 474 **kwargs, 475 ) 476 coord = coord[0] + offset[0], coord[1] + offset[1] AttributeError: 'TransposedFont' object has no attribute 'getmask2' During handling of the above exception, another exception occurred: KeyboardInterrupt Traceback (most recent call last) Cell In [321], line 6 3 wordcloud = WordCloud(stopwords=STOPWORDS, background_color="white", max_words=1000, 4 width=1000, height=1000).generate(category_tokens[cat]) 5 rcParams['figure.figsize'] = 30,30 ----> 6 plt.imshow(wordcloud) 7 plt.axis("off") 8 plt.show() File ~/miniforge3/lib/python3.10/site-packages/matplotlib/_api/deprecation.py:454, in make_keyword_only.<locals>.wrapper(*args, **kwargs) 448 if len(args) > name_idx: 449 warn_deprecated( 450 since, message="Passing the %(name)s %(obj_type)s " 451 "positionally is deprecated since Matplotlib %(since)s; the " 452 "parameter will become keyword-only %(removal)s.", 453 name=name, obj_type=f"parameter of {func.__name__}()") --> 454 return func(*args, **kwargs) File ~/miniforge3/lib/python3.10/site-packages/matplotlib/pyplot.py:2631, in imshow(X, cmap, norm, aspect, interpolation, alpha, vmin, vmax, origin, extent, interpolation_stage, filternorm, filterrad, resample, url, data, **kwargs) 2625 @_copy_docstring_and_deprecators(Axes.imshow) 2626 def imshow( 2627 X, cmap=None, norm=None, aspect=None, interpolation=None, 2628 alpha=None, vmin=None, vmax=None, origin=None, extent=None, *, 2629 interpolation_stage=None, filternorm=True, filterrad=4.0, 2630 resample=None, 
url=None, data=None, **kwargs): -> 2631 __ret = gca().imshow( 2632 X, cmap=cmap, norm=norm, aspect=aspect, 2633 interpolation=interpolation, alpha=alpha, vmin=vmin, 2634 vmax=vmax, origin=origin, extent=extent, 2635 interpolation_stage=interpolation_stage, 2636 filternorm=filternorm, filterrad=filterrad, resample=resample, 2637 url=url, **({"data": data} if data is not None else {}), 2638 **kwargs) 2639 sci(__ret) 2640 return __ret File ~/miniforge3/lib/python3.10/site-packages/matplotlib/_api/deprecation.py:454, in make_keyword_only.<locals>.wrapper(*args, **kwargs) 448 if len(args) > name_idx: 449 warn_deprecated( 450 since, message="Passing the %(name)s %(obj_type)s " 451 "positionally is deprecated since Matplotlib %(since)s; the " 452 "parameter will become keyword-only %(removal)s.", 453 name=name, obj_type=f"parameter of {func.__name__}()") --> 454 return func(*args, **kwargs) File ~/miniforge3/lib/python3.10/site-packages/matplotlib/__init__.py:1433, in _preprocess_data.<locals>.inner(ax, data, *args, **kwargs) 1430 @functools.wraps(func) 1431 def inner(ax, *args, data=None, **kwargs): 1432 if data is None: -> 1433 return func(ax, *map(sanitize_sequence, args), **kwargs) 1435 bound = new_sig.bind(ax, *args, **kwargs) 1436 auto_label = (bound.arguments.get(label_namer) 1437 or bound.kwargs.get(label_namer)) File ~/miniforge3/lib/python3.10/site-packages/matplotlib/axes/_axes.py:5610, in Axes.imshow(self, X, cmap, norm, aspect, interpolation, alpha, vmin, vmax, origin, extent, interpolation_stage, filternorm, filterrad, resample, url, **kwargs) 5602 self.set_aspect(aspect) 5603 im = mimage.AxesImage(self, cmap=cmap, norm=norm, 5604 interpolation=interpolation, origin=origin, 5605 extent=extent, filternorm=filternorm, 5606 filterrad=filterrad, resample=resample, 5607 interpolation_stage=interpolation_stage, 5608 **kwargs) -> 5610 im.set_data(X) 5611 im.set_alpha(alpha) 5612 if im.get_clip_path() is None: 5613 # image does not already have clipping set, clip to 
axes patch File ~/miniforge3/lib/python3.10/site-packages/matplotlib/image.py:697, in _ImageBase.set_data(self, A) 695 if isinstance(A, PIL.Image.Image): 696 A = pil_to_array(A) # Needed e.g. to apply png palette. --> 697 self._A = cbook.safe_masked_invalid(A, copy=True) 699 if (self._A.dtype != np.uint8 and 700 not np.can_cast(self._A.dtype, float, "same_kind")): 701 raise TypeError("Image data of dtype {} cannot be converted to " 702 "float".format(self._A.dtype)) File ~/miniforge3/lib/python3.10/site-packages/matplotlib/cbook/__init__.py:743, in safe_masked_invalid(x, copy) 742 def safe_masked_invalid(x, copy=False): --> 743 x = np.array(x, subok=True, copy=copy) 744 if not x.dtype.isnative: 745 # If we have already made a copy, do the byteswap in place, else make a 746 # copy with the byte order swapped. 747 x = x.byteswap(inplace=copy).newbyteorder('N') # Swap to native order. File ~/miniforge3/lib/python3.10/site-packages/wordcloud/wordcloud.py:747, in WordCloud.__array__(self) 739 def __array__(self): 740 """Convert to numpy array. 741 742 Returns (...) 745 Word cloud image as numpy matrix. 746 """ --> 747 return self.to_array() File ~/miniforge3/lib/python3.10/site-packages/wordcloud/wordcloud.py:737, in WordCloud.to_array(self) 729 def to_array(self): 730 """Convert to numpy array. 731 732 Returns (...) 735 Word cloud image as numpy matrix. 
736 """ --> 737 return np.array(self.to_image()) File ~/miniforge3/lib/python3.10/site-packages/wordcloud/wordcloud.py:666, in WordCloud.to_image(self) 662 transposed_font = ImageFont.TransposedFont( 663 font, orientation=orientation) 664 pos = (int(position[1] * self.scale), 665 int(position[0] * self.scale)) --> 666 draw.text(pos, word, fill=color, font=transposed_font) 668 return self._draw_contour(img=img) File ~/miniforge3/lib/python3.10/site-packages/PIL/ImageDraw.py:520, in ImageDraw.text(self, xy, text, fill, font, anchor, spacing, align, direction, features, language, stroke_width, stroke_fill, embedded_color, *args, **kwargs) 517 draw_text(ink, 0) 518 else: 519 # Only draw normal text --> 520 draw_text(ink) File ~/miniforge3/lib/python3.10/site-packages/PIL/ImageDraw.py:479, in ImageDraw.text.<locals>.draw_text(ink, stroke_width, stroke_offset) 477 except AttributeError: 478 try: --> 479 mask = font.getmask( 480 text, 481 mode, 482 direction, 483 features, 484 language, 485 stroke_width, 486 anchor, 487 ink, 488 start=start, 489 *args, 490 **kwargs, 491 ) 492 except TypeError: 493 mask = font.getmask(text) File ~/miniforge3/lib/python3.10/site-packages/PIL/ImageFont.py:908, in TransposedFont.getmask(self, text, mode, *args, **kwargs) 907 def getmask(self, text, mode="", *args, **kwargs): --> 908 im = self.font.getmask(text, mode, *args, **kwargs) 909 if self.orientation is not None: 910 return im.transpose(self.orientation) File ~/miniforge3/lib/python3.10/site-packages/PIL/ImageFont.py:665, in FreeTypeFont.getmask(self, text, mode, direction, features, language, stroke_width, anchor, ink, start) 587 def getmask( 588 self, 589 text, (...) 597 start=None, 598 ): 599 """ 600 Create a bitmap for the text. 601 (...) 663 :py:mod:`PIL.Image.core` interface module. 
664 """ --> 665 return self.getmask2( 666 text, 667 mode, 668 direction=direction, 669 features=features, 670 language=language, 671 stroke_width=stroke_width, 672 anchor=anchor, 673 ink=ink, 674 start=start, 675 )[0] File ~/miniforge3/lib/python3.10/site-packages/PIL/ImageFont.py:778, in FreeTypeFont.getmask2(self, text, mode, fill, direction, features, language, stroke_width, anchor, ink, start, *args, **kwargs) 776 Image._decompression_bomb_check(size) 777 im = fill("RGBA" if mode == "RGBA" else "L", size, 0) --> 778 self.font.render( 779 text, 780 im.id, 781 mode, 782 direction, 783 features, 784 language, 785 stroke_width, 786 ink, 787 start[0], 788 start[1], 789 ) 790 return im, offset KeyboardInterrupt:
# Build one space-separated string of lemmas per star rating.
tokens_df = merged_df[['rating', 'tokens']].copy()
tokens_df['tokens'] = tokens_df['tokens'].apply(lambda lemmas: ' '.join(lemmas))
rating_tokens = (
    tokens_df.groupby('rating')['tokens']
    .agg(lambda review_texts: ' '.join(review_texts.to_list()))
)
rating_tokens
rating 1.0 recently buy wife acura 3.2 tl navigation syst... 2.0 product lack want ill stick laptop like compac... 3.0 hi clock setting brightness number big light f... 4.0 buy streetpilot move new city want lose soon b... 5.0 cover amazing deal fit perfect protect cheaply... Name: tokens, dtype: object
# One word cloud per star rating, built from that rating's joined lemmas.
for rat, rat_text in rating_tokens.items():
    print(rat)
    wordcloud = WordCloud(
        stopwords=STOPWORDS,
        background_color="white",
        max_words=1000,
        width=1000,
        height=1000,
    ).generate(rat_text)
    rcParams['figure.figsize'] = (50, 50)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
1.0
2.0
3.0
4.0
5.0
# Extend the default stop words with extra words to suppress in the clouds.
# `set.update` mutates in place and returns None, so the original assignment
# left `stopwords` as None and permanently changed the shared STOPWORDS set.
# The union builds a new set, and that set is what gets passed to WordCloud.
stopwords = STOPWORDS | {'work', 'time', 'buy', 'product', 'use', 'camera'}

# The figure size does not change between iterations, so set it once.
rcParams['figure.figsize'] = 50, 50

# One word cloud per category, using the extended stop-word set.
for cat in category_tokens.index:
    print(cat)
    wordcloud = WordCloud(
        stopwords=stopwords,
        relative_scaling=0.3,
        background_color="white",
        max_words=1000,
        width=1000,
        height=1000,
    ).generate(category_tokens[cat])
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
1.0
2.0
3.0
4.0
5.0
#most frequent words for each rating
# common words in each category
#wordmap
# Small scratch frame whose column "b" holds lists of different lengths.
df = pd.DataFrame(
    [
        (1, [1, 2], 5),
        (2, [2, 3, 4], 6),
        (3, [5], 7),
    ],
    columns=["a", "b", "c"],
)
df
| a | b | c | |
|---|---|---|---|
| 0 | 1 | [1, 2] | 5 |
| 1 | 2 | [2, 3, 4] | 6 |
| 2 | 3 | [5] | 7 |
df.b.str.len()
0 2 1 3 2 1 Name: b, dtype: int64
# The 15 brands with the most reviews, as columns `brand_name` / `counts`.
# The output columns are named explicitly instead of renaming whatever
# `reset_index()` produces: those default names differ between pandas
# versions ('index'/'brand' vs 'brand'/'count'), so the old rename could
# silently leave the wrong column labels.
freq_brands_15 = (
    merged_df['brand']
    .value_counts()
    .sort_values(ascending=False)
    .head(15)
    .rename_axis('brand_name')
    .reset_index(name='counts')
)
freq_brands_15
| brand_name | counts | |
|---|---|---|
| 0 | Sony | 161 |
| 1 | Panasonic | 89 |
| 2 | Samsung | 76 |
| 3 | Canon | 72 |
| 4 | Dell | 56 |
| 5 | HP | 55 |
| 6 | JVC | 49 |
| 7 | StarTech | 48 |
| 8 | Acer | 47 |
| 9 | Lenovo | 47 |
| 10 | Koss | 44 |
| 11 | uxcell | 43 |
| 12 | Kodak | 41 |
| 13 | JJC | 40 |
| 14 | Lexar | 40 |
# Review counts of the 15 most-reviewed brands.
px.bar(freq_brands_15, x='brand_name', y='counts', text_auto=True)

# Ratings of all reviews that belong to those brands.
is_top_brand = merged_df['brand'].isin(freq_brands_15['brand_name'])
brand_rating_15 = merged_df.loc[is_top_brand, ['brand', 'rating']]
px.box(brand_rating_15, x='brand', y='rating')

# Number of reviews for every (brand, rating) combination.
brand_rating_freq = (
    brand_rating_15.groupby(['brand', 'rating'])
    .agg(count=('brand', 'count'))
    .reset_index()
)
brand_rating_freq
| brand | rating | count | |
|---|---|---|---|
| 0 | Acer | 1.0 | 6 |
| 1 | Acer | 2.0 | 4 |
| 2 | Acer | 3.0 | 6 |
| 3 | Acer | 4.0 | 17 |
| 4 | Acer | 5.0 | 14 |
| ... | ... | ... | ... |
| 69 | uxcell | 1.0 | 5 |
| 70 | uxcell | 2.0 | 1 |
| 71 | uxcell | 3.0 | 8 |
| 72 | uxcell | 4.0 | 16 |
| 73 | uxcell | 5.0 | 13 |
74 rows × 3 columns
# Bubble chart: marker size is the number of reviews per (brand, rating).
px.scatter(brand_rating_freq, x='brand', y='rating', size='count')

# Number of reviews for every (category_1, verified_purchase) combination.
category_verified_pur = (
    merged_df[['category_1', 'verified_purchase']]
    .value_counts()
    .reset_index(name='counts')
)
category_verified_pur
| category_1 | verified_purchase | counts | |
|---|---|---|---|
| 0 | Computers & Accessories | True | 2074 |
| 1 | Camera & Photo | True | 1081 |
| 2 | Accessories & Supplies | True | 939 |
| 3 | Headphones | True | 290 |
| 4 | Car & Vehicle Electronics | True | 273 |
| 5 | Computers & Accessories | False | 249 |
| 6 | Portable Audio & Video | True | 234 |
| 7 | Home Audio | True | 155 |
| 8 | Camera & Photo | False | 123 |
| 9 | Security & Surveillance | True | 78 |
| 10 | GPS, Finders & Accessories | True | 76 |
| 11 | Television & Video | False | 55 |
| 12 | Portable Audio & Video | False | 41 |
| 13 | Television & Video | True | 38 |
| 14 | Accessories & Supplies | False | 32 |
| 15 | Headphones | False | 29 |
| 16 | Home Audio | False | 26 |
| 17 | GPS, Finders & Accessories | False | 18 |
| 18 | Car & Vehicle Electronics | False | 17 |
| 19 | Video Projectors | False | 15 |
| 20 | Service Plans | True | 13 |
| 21 | Electronics Warranties | False | 13 |
| 22 | Security & Surveillance | False | 8 |
| 23 | Service Plans | False | 7 |
| 24 | Video Projectors | True | 6 |
| 25 | Electronics Warranties | True | 4 |
| 26 | eBook Readers & Accessories | True | 3 |
# merged_df
# Sunburst: inner ring is the category, outer ring the verified-purchase
# flag, and sector size the review count.
sunburst_levels = ['category_1', 'verified_purchase']
fig = px.sunburst(category_verified_pur, path=sunburst_levels, values='counts')
fig.show()
merged_df
| category | description | title_x | also_buy | brand | feature | rank | also_view | main_cat | date_x | ... | asin | title_y | content | date_y | author | rating | found_helpful | verified_purchase | product | category_1 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | [Electronics, Computers & Accessories, Tablet ... | Brand new and high quality\nLightweight soft c... | PU Leather 360 Degree Rotating Stand Case Cove... | [] | new brand | Brand new and high quality Lightweight soft ca... | [[Computers & Accessories > Tablet Accessories... | [] | Computers | 2012-11-23 | ... | 1039869017 | love it!! | This cover was an amazing deal, not only does ... | 2013-03-08 | lorena mahoney | 5.0 | 1 | True | Yellow Protective Silicone Gel Slim Thin Back ... | Computers & Accessories |
| 1 | [Electronics, Computers & Accessories, Tablet ... | Brand new and high quality\nLightweight soft c... | PU Leather 360 Degree Rotating Stand Case Cove... | [] | new brand | Brand new and high quality Lightweight soft ca... | [[Computers & Accessories > Tablet Accessories... | [] | Computers | 2012-11-23 | ... | 1039869017 | Works fine | Works great. Has a obviously fake leather exte... | 2013-09-05 | Alex P | 5.0 | 0 | True | Yellow Protective Silicone Gel Slim Thin Back ... | Computers & Accessories |
| 2 | [Electronics, Headphones] | This beautifully illustrated children's book t... | The Legend of the Starfish | [] | The Joy Market | [[Cell Phones & Accessories , 1,053,995], [Cel... | [] | Cell Phones & Accessories | NaT | ... | 1944288023 | but I sure enjoyed it myself | This may be a kids book, but I sure enjoyed it... | 2016-04-22 | RedCurlz | 5.0 | 0 | False | The Legend of the Starfish | Headphones | |
| 3 | [Electronics, Headphones] | This beautifully illustrated children's book t... | The Legend of the Starfish | [] | The Joy Market | [[Cell Phones & Accessories , 1,053,995], [Cel... | [] | Cell Phones & Accessories | NaT | ... | 1944288023 | Such a great book! The illustrations are beaut... | Such a great book! The illustrations are beaut... | 2016-04-19 | Micah Wood | 5.0 | 1 | False | The Legend of the Starfish | Headphones | |
| 4 | [Electronics, Headphones] | This beautifully illustrated children's book t... | The Legend of the Starfish | [] | The Joy Market | [[Cell Phones & Accessories , 1,053,995], [Cel... | [] | Cell Phones & Accessories | NaT | ... | 1944288023 | Add to your library! | Whimsical, poignant, a breath of fresh air. A ... | 2016-04-19 | KAT | 5.0 | 1 | False | The Legend of the Starfish | Headphones | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5892 | [Electronics, Car & Vehicle Electronics, Car E... | Feature : G-sensor function and motion detecti... | Lecmal ix-se3-2 GS8000 DVR Recorder 1080P/2.7&... | [] | Lecmal | 3\n3\n3\n3\n3 | [[Electronics , 163,429], [Electronics > Car E... | [] | Car Electronics | 2016-06-22 | ... | B01HEKL4KI | Its good for they price | Its good for they price, i just wish it had a ... | 2016-08-29 | Remo | 4.0 | 0 | True | Lecmal ix-se3-2 GS8000 DVR Recorder 1080P/2.7"... | Car & Vehicle Electronics |
| 5893 | [Electronics, Car & Vehicle Electronics, Car E... | Feature : G-sensor function and motion detecti... | Lecmal ix-se3-2 GS8000 DVR Recorder 1080P/2.7&... | [] | Lecmal | 3\n3\n3\n3\n3 | [[Electronics , 163,429], [Electronics > Car E... | [] | Car Electronics | 2016-06-22 | ... | B01HEKL4KI | I like this product | I like this product , but it is hard to use af... | 2017-11-25 | Donna B | 4.0 | 0 | True | Lecmal ix-se3-2 GS8000 DVR Recorder 1080P/2.7"... | Car & Vehicle Electronics |
| 5894 | [Electronics, Car & Vehicle Electronics, Car E... | Feature : G-sensor function and motion detecti... | Lecmal ix-se3-2 GS8000 DVR Recorder 1080P/2.7&... | [] | Lecmal | 3\n3\n3\n3\n3 | [[Electronics , 163,429], [Electronics > Car E... | [] | Car Electronics | 2016-06-22 | ... | B01HEKL4KI | ... DVR is Ok for the money daytime it works g... | This little DVR is Ok for the money daytime it... | 2016-08-20 | Hawk | 4.0 | 2 | True | Lecmal ix-se3-2 GS8000 DVR Recorder 1080P/2.7"... | Car & Vehicle Electronics |
| 5895 | [Electronics, Car & Vehicle Electronics, Car E... | Feature : G-sensor function and motion detecti... | Lecmal ix-se3-2 GS8000 DVR Recorder 1080P/2.7&... | [] | Lecmal | 3\n3\n3\n3\n3 | [[Electronics , 163,429], [Electronics > Car E... | [] | Car Electronics | 2016-06-22 | ... | B01HEKL4KI | Three Stars | Play back just give picture not video | 2017-01-10 | Fred Maragheh | 3.0 | 0 | True | Lecmal ix-se3-2 GS8000 DVR Recorder 1080P/2.7"... | Car & Vehicle Electronics |
| 5896 | [Electronics, Car & Vehicle Electronics, Car E... | Feature : G-sensor function and motion detecti... | Lecmal ix-se3-2 GS8000 DVR Recorder 1080P/2.7&... | [] | Lecmal | 3\n3\n3\n3\n3 | [[Electronics , 163,429], [Electronics > Car E... | [] | Car Electronics | 2016-06-22 | ... | B01HEKL4KI | Three Stars | Difficult to figure out for an old person. | 2016-08-30 | bamatutz | 3.0 | 0 | True | Lecmal ix-se3-2 GS8000 DVR Recorder 1080P/2.7"... | Car & Vehicle Electronics |
5897 rows × 21 columns
merged_df.found_helpful.value_counts()
0 3570
1 947
2 420
3 234
4 133
...
89 1
118 1
75 1
65 1
275 1
Name: found_helpful, Length: 75, dtype: int64
# Helpful-vote count of every review by category, coloured by rating.
px.scatter(merged_df, color='rating', y='found_helpful', x='category_1')

# Keep only reviews with a non-zero helpful-vote count.
has_votes = merged_df['found_helpful'] != 0
helpful_votes = merged_df.loc[has_votes, ['category_1', 'rating', 'found_helpful']].copy()

# Bucket the vote counts into intervals (0, 1], (1, 2], ..., (11, 399].
vote_bin_edges = [0, 1, 2, 3, 4, 6, 11, 399]
helpful_votes['found_helpful_bins'] = pd.cut(helpful_votes['found_helpful'], vote_bin_edges)

# Count reviews per (category_1, rating, bin) combination.
helpful_votes = (
    helpful_votes.drop(columns='found_helpful')
    .value_counts()
    .rename('count')
    .reset_index()
)

# Convert both grouping columns to strings.
helpful_votes = helpful_votes.astype({'found_helpful_bins': str, 'rating': str})
helpful_votes
| category_1 | rating | found_helpful_bins | count | raring | |
|---|---|---|---|---|---|
| 0 | Computers & Accessories | 5.0 | (0, 1] | 163 | 5.0 |
| 1 | Accessories & Supplies | 5.0 | (0, 1] | 106 | 5.0 |
| 2 | Computers & Accessories | 4.0 | (0, 1] | 88 | 4.0 |
| 3 | Camera & Photo | 5.0 | (0, 1] | 82 | 5.0 |
| 4 | Computers & Accessories | 5.0 | (1, 2] | 77 | 5.0 |
| ... | ... | ... | ... | ... | ... |
| 290 | Home Audio | 1.0 | (2, 3] | 1 | 1.0 |
| 291 | Headphones | 4.0 | (3, 4] | 1 | 4.0 |
| 292 | Headphones | 3.0 | (4, 6] | 1 | 3.0 |
| 293 | Headphones | 2.0 | (11, 399] | 1 | 2.0 |
| 294 | Video Projectors | 5.0 | (11, 399] | 1 | 5.0 |
295 rows × 5 columns
# Bubble chart of review counts per (rating, helpful-vote bin), coloured by category.
# `rating` was cast to str when `helpful_votes` was built, so its category
# order must list the string labels; the float values used before never
# matched the data, so the requested order was not applied.
px.scatter(
    helpful_votes,
    x='rating',
    y='found_helpful_bins',
    color='category_1',
    size='count',
    category_orders={
        'rating': ['1.0', '2.0', '3.0', '4.0', '5.0'],
        'found_helpful_bins': ['(0, 1]', '(1, 2]', '(2, 3]', '(3, 4]', '(4, 6]', '(6, 11]', '(11, 399]'][::-1],
    },
)
helpful_votes
| category_1 | rating | found_helpful_bins | count | |
|---|---|---|---|---|
| 0 | Computers & Accessories | 5.0 | (0, 1] | 163 |
| 1 | Accessories & Supplies | 5.0 | (0, 1] | 106 |
| 2 | Computers & Accessories | 4.0 | (0, 1] | 88 |
| 3 | Camera & Photo | 5.0 | (0, 1] | 82 |
| 4 | Computers & Accessories | 5.0 | (1, 2] | 77 |
| ... | ... | ... | ... | ... |
| 290 | Home Audio | 1.0 | (2, 3] | 1 |
| 291 | Headphones | 4.0 | (3, 4] | 1 |
| 292 | Headphones | 3.0 | (4, 6] | 1 |
| 293 | Headphones | 2.0 | (11, 399] | 1 |
| 294 | Video Projectors | 5.0 | (11, 399] | 1 |
295 rows × 4 columns